;; This file is part of scheme-GNUnet, a Scheme port of GNUnet .
;;  Copyright (C) 2010, 2016, 2017 GNUnet e.V.
;;  Copyright (C) 2021 Maxime Devos <maximedevos@telenet.be>
;;
;; GNUnet is free software: you can redistribute it and/or modify it
;; under the terms of the GNU Affero General Public License as published
;; by the Free Software Foundation, either version 3 of the License,
;;  or (at your option) any later version.
;;
;; GNUnet is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; Affero General Public License for more details.
;;
;; You should have received a copy of the GNU Affero General Public License
;;  along with this program.  If not, see <http://www.gnu.org/licenses/>.
;;
;; SPDX-License-Identifier: AGPL3.0-or-later

;; C file: util/mst.c
;; Brief: convenience functions for handling inbound message buffers
;; Author: Christian Grothoff
;; Adapted to Scheme by Maxime Devos
;;
;; The most prominent use would be the implementation of message queues
;; over stream sockets, where the separate messages need to be split
;; from each other.  However, it is used in some other places as
;; well.
;;
;; The Scheme implementation does not support the 'purge' and 'one-shot'
;; modes. 'purge' should be simple to implement though and 'one-shot'
;; could be implemented with delimited continuations.
;;
;; The implementation avoids copying when possible.

(define-library (gnu gnunet utils tokeniser)
  (export make-tokeniser
	  tokeniser?

	  &interrupted-tokeniser-violation
	  make-interrupted-tokeniser-violation
	  interrupted-tokeniser-violation?

	  &kaput-tokeniser-error
	  make-kaput-tokeniser-error
	  kaput-tokeniser-error?

	  add-bytevector!
	  add-from-port!)
  (import (only (rnrs base)
		define and < assert begin quote lambda
		>= integer? exact? <= expt = cond
		let + - eq? > * min if)
	  (only (rnrs conditions)
		define-condition-type condition make-who-condition
		&violation &error)
	  (only (rnrs exceptions)
		raise)
	  (only (rnrs bytevectors)
		endianness bytevector-copy! bytevector? bytevector-length
		bytevector-u8-ref bytevector-u8-set! bytevector-u16-ref
		bytevector-u16-set! make-bytevector)
	  (only (rnrs records syntactic)
		define-record-type)
	  (only (srfi srfi-26) cut)
	  (only (guile) lambda*)
	  (only (ice-9 binary-ports) get-bytevector-some!)
	  (only (ice-9 ports) eof-object?)
	  (only (gnu gnunet util struct)
		/:message-header)
	  (only (gnu gnunet netstruct syntactic)
		sizeof)
	  (only (gnu gnunet utils hat-let) let^))
  (begin
    (define-condition-type &interrupted-tokeniser-violation &violation
      make-interrupted-tokeniser-violation interrupted-tokeniser-violation?)
    (define-condition-type &kaput-tokeniser-error &error
      make-kaput-tokeniser-error kaput-tokeniser-error?)

    (define-record-type
	(<tokeniser> make-tokeniser tokeniser-state?)
      ;; Current buffer.
      ;;
      ;; Alternatively, when @code{add-bytevector-copy!} is
      ;; being called, this is temporarily set to @code{#f},
      ;; to detect re-entrancy. And if a message with size
      ;; less than the message header is found, then it is set
      ;; to @code{#t}, marking the tokeniser as ‘kaput’.
      (fields (mutable buffer tokeniser-buffer set-tokeniser-buffer!)
	      ;; Number of bytes in the buffer.
	      (mutable position tokeniser-position set-tokeniser-position!))
      (protocol
       (lambda (%make)
	 (lambda* (#:key
		   (initial-size (sizeof /:message-header '())))
	   "Make an empty tokeniser.  A buffer of size @var{initial-size}
will be pre-allocated.  This size must be an exact natural and it
might be adjusted."
	   (assert (and (integer? initial-size) (exact? initial-size)
			(>= initial-size 0)))
	   (%make (make-bytevector
		   (cond ((<= initial-size (sizeof /:message-header '()))
			  (sizeof /:message-header '()))
			 ((>= initial-size (expt 2 16))
			  (expt 2 16))
			 (#t initial-size)))
		  0))))
      (opaque #t)
      (sealed #t))

    (define (add-bytevector! tok bv offset length
			     handle/message
			     return/done
			     return/overly-small)
      "Feed up to @var{length} bytes from the bytevector @var{bv}
starting at @var{offset} to the tokeniser @var{tok}.

When a complete message is assembled, the callback @var{handle/message}
is called with an appropriate bytevector region.  This bytevector region
is part of the passed bytevector range (@var{bv}, @var{offset}, @var{length})
or the tokeniser's internal buffer.

If a message size was overly small, i.e., smaller than its header,
then @var{return/overly-small} is called in tail position with the
specified message type (as an integer) and message size.  In that case,
@var{tok} will be marked as kaput.  As the message type is not always
available, sometimes @code{#false} will be pased instead.

On success, @code{return/done} is called in tail position without
arguments.

This procedure may only be called if @var{tok} isn't kaput,
and it may not be called re-entrantly.  In the former case,
a @code{&kaput-tokeniser-error} is raised.  In the latter case,
a @code{&interrupted-tokeniser-violation} may be raised
but this cannot be guaranteed."
      ;; ^ mainly due to parallelism reasons
      (define set-buffer! (cut set-tokeniser-buffer! tok <>))
      (define set-position! (cut set-tokeniser-position! tok <>))
      (define mark-kaput! (cut set-tokeniser-buffer! tok #t))
      (define (maybe-reallocate/no-move buffer minimal-size)
	"Return a fresh bytevector or the bytevector @var{buffer} of
at least size @var{minimal-size}.  Avoid allocations."
	(if (<= minimal-size (bytevector-length buffer))
	    buffer
	    (make-bytevector minimal-size)))
      ;; Possibilities:
      ;;  (a) @var{length} is zero. Then there's nothing to do!
      ;;      The other possibilities will assume @var{length}
      ;;      is at least one.
      ;;
      ;;  (b) If the tokeniser buffer is empty and @var{bv} starts
      ;;      with a complete message, then call the processor
      ;;      on the message and continue.
      ;;
      ;;  (c) If the tokeniser buffer is empty and @var{bv} starts
      ;;      with an incomplete message, then copy the partial message
      ;;      to the tokeniser buffer (reallocating it if necessary)
      ;;      and stop.
      ;;
      ;;      If the message size is known, it is considered necessary
      ;;      for the tokeniser buffer to be at least that size.
      ;;
      ;;  (d) If the tokeniser buffer is non-empty and the message size
      ;;      of the partial message in that buffer is unknown,
      ;;      then determine the size, if necessary reallocate the tokeniser
      ;;      buffer to be at least that size, copy the size into the
      ;;      buffer and continue.
      ;;
      ;;      The message won't be complete yet, as a 'type' field always
      ;;      comes after the 'size' field in the message header.
      ;;
      ;;  (e) If the tokeniser buffer is non-empty and the message size
      ;;      of the partial message in that buffer is known,
      ;;      then copy the remainder of the message into the tokeniser
      ;;      buffer (as far as possible).
      ;;
      ;;      If this makes the message complete, then process the message
      ;;      and continue.  If the message isn't complete, then just stop,
      ;;      as all of @var{bv} has been copied.
      (define (continue buffer position bv offset length)
	;; The buffer is set before calls to return/done or return/overly-small.
	;; The position is set after it changes and before the tail-iteration
	;; into 'continue'.
	(cond ((= length 0)
	       (set-buffer! buffer)
	       (return/done)) ; possibility (a)
	      ((and (= position 0)
		    ;; possibility (c), length unknown
		    (< length 2))
	       (bytevector-u8-set! buffer position
				   (bytevector-u8-ref bv offset))
	       (set-buffer! buffer)
	       (set-position! 1)
	       (return/done))
	      ((= position 0) ; and (>= length 2)
	       (let ((size (bytevector-u16-ref bv offset (endianness big))))
		 (cond ((< size (sizeof /:message-header '()))
			(mark-kaput!)
			(return/overly-small
			 (and (>= length 4)
			      ;; + 2: skip the "size" field and read
			      ;; the 'type' field
			      (bytevector-u16-ref bv (+ offset 2)
						  (endianness big)))
			 size))
		       ;; possibility (b)
		       ((<= size length)
			(handle/message bv offset size)
			(continue buffer position bv (+ offset size)
				  (- length size)))
		       ;; Now, (< length size) -- possibility (c).
		       (#t
			(let ((buffer
			       ;; Re-allocate the buffer if required.
			       (maybe-reallocate/no-move buffer size)))
			  ;; Write the partial message to the buffer
			  (bytevector-copy! bv offset buffer 0 length)
			  (set-buffer! buffer)
			  (set-position! (+ position length))
			  (return/done))))))
	      ((>= position 2) ; possibility (e)
	       (let^ ((! size (bytevector-u16-ref buffer 0 (endianness big)))
		      (!! (<= (sizeof /:message-header '()) size))
		      (!! (<= size (bytevector-length buffer)))
		      (!! (< position size))
		      ;; How many bytes must be copied?
		      (! extra (min length (- size position)))
		      ;; Copy the bytes.
		      (_ (bytevector-copy! bv offset buffer position extra))
		      (! position (+ position extra))
		      (!! (<= position size))
		      ;; do not set the buffer yet, such that
		      ;; re-entrancy from the 'handle/message' callback
		      ;; can be detected.
		      (? (< position size)
			 ;; Message is not yet complete --> stop.
			 (assert (= length extra))
			 (set-buffer! buffer)
			 ;; some bytes have been copied
			 (set-position! position)
			 (return/done)))
		     ;; Message is complete --> process it and continue
		     ;; (there may be other messages as well!)
		     (handle/message buffer 0 size)
		     (set-position! 0)
		     (continue buffer 0 bv (+ offset extra) (- length extra))))
	      ;; (< position 2), possibility (d)
	      (#t
	       (let^ ((! size/byte-0 (bytevector-u8-ref buffer 0))
		      (! size/byte-1 (bytevector-u8-ref bv offset))
		      (! size (+ (* (expt 2 8) size/byte-0)
				 size/byte-1))
		      (? (< size (sizeof /:message-header '()))
			 (mark-kaput!)
			 (return/overly-small
			  (and (>= length 3)
			       (bytevector-u16-ref bv (+ offset 1)
						   (endianness big)))
			  size))
		      (! buffer (maybe-reallocate/no-move buffer size)))
		     (bytevector-u16-set! buffer 0 size (endianness big))
		     (set-position! 2)
		     (continue buffer 2 bv (+ offset 1) (- length 1))))))
      (let^ ((! buffer (tokeniser-buffer tok))
	     (! position (tokeniser-position tok))
	     (? (eq? buffer #t)
		(raise (condition
			(make-who-condition 'add-bytevector!)
			(make-kaput-tokeniser-error))))
	     (? (eq? buffer #f)
		(raise (condition
			(make-who-condition 'add-bytevector!)
			(make-interrupted-tokeniser-violation))))
	     (!! (and (bytevector? buffer)
		      (integer? position)
		      (exact? position)
		      (integer? offset)
		      (exact? offset)
		      (integer? length)
		      (exact? length)
		      (<= (+ offset length) (bytevector-length bv))
		      (<= 0 position)
		      (< position (bytevector-length buffer)))))
	    ;; The buffer will be restored at the call to
	    ;; 'return/done' or 'return/overly-small'.
	    (set-buffer! #f)
	    (continue buffer position bv offset length)))

    (define (add-from-port! tok port handle/message return/overly-small
			    return/done-eof return/premature-eof)
      "Keep reading data from the input port @var{port}, feeding them
to the tokeniser @var{tok}.

The procedures @var{handle/message}, and @var{return/overly-small} are used
as in @code{add-bytevector!}.  When the end of file has been reached, and
@var{tok} doesn't hold a partial message, the thunk @var{return/done-eof}
is called in tail position.  When the end of file has been reached, and
@var{tok} does still hold a partial message, the thunk
@var{return/premature-eof} is instead called in tail position.

As with @ode{add-bytevector!}, @code{&kaput-tokeniser-error} and
@code{&interrupted-tokeniser-violation} can be raised.

This is a blocking operation!."
      ;; Cheaty, but it works!  I'd presume Guile or glibc have an
      ;; optimisation for copying a memory region to itself.  Also,
      ;; this saves a buffer allocation.
      (let^ ((! buffer (tokeniser-buffer tok))
	     (! position (tokeniser-position tok))
	     (? (eq? buffer #t)
		(raise (condition
			(make-who-condition 'add-from-port!)
			(make-kaput-tokeniser-error))))
	     (? (eq? buffer #f)
		(raise (condition
			(make-who-condition 'add-from-port!)
			(make-interrupted-tokeniser-violation))))
	     (! length (- (bytevector-length buffer) position))
	     (! n/read (get-bytevector-some! port buffer position length))
	     (? (eof-object? n/read)
		;; If 'position' is 0, then there was no incomplete
		;; message in the tokeniser.
		((if (= position 0) return/done-eof return/premature-eof)))
	     (! (return/add-bytevector!-done)
		(add-from-port! tok port handle/message return/overly-small
				return/done-eof return/premature-eof)))
	    (add-bytevector! tok buffer position n/read handle/message
			     return/add-bytevector!-done
			     return/overly-small)))))
